#!/usr/bin/env python3
"""
Entrypoint script for running the notability classification pipeline.

This is a thin launcher that puts the sibling ``src`` directory on ``sys.path``
and then imports and calls ``run_notability_pipeline`` from
``connecting_people.preprocessing.notability_pipeline``.
"""
import argparse
import sys
from pathlib import Path

# Make the ``src`` directory that sits one level above this script's directory
# importable. ``resolve()`` turns ``__file__`` into an absolute path first: on
# Python < 3.9 the ``__main__`` module's ``__file__`` may be relative, and
# ``Path("script.py").parent.parent`` then collapses to "." (the current
# directory) instead of the directory above the script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from connecting_people.preprocessing.notability_pipeline import run_notability_pipeline


def _positive_int(text):
    """Argparse ``type`` converter that accepts only integers >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(
            f"must be a positive integer, got {value}"
        )
    return value


def _unit_interval_float(text):
    """Argparse ``type`` converter that accepts only floats in [0, 1]."""
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
    # Written as a negated chained comparison so that NaN is rejected as well.
    if not 0.0 <= value <= 1.0:
        raise argparse.ArgumentTypeError(
            f"must be between 0 and 1 inclusive, got {value}"
        )
    return value


def main(argv=None):
    """Parse command-line arguments and run the notability pipeline.

    Args:
        argv: Optional list of argument strings (without the program name).
            When ``None`` (the default), argparse reads ``sys.argv[1:]``,
            which is the behavior of calling ``main()`` with no arguments.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when the arguments are
            missing or invalid, including a non-positive ``--chunksize`` or
            ``--batch-size`` and a ``--similarity-threshold`` outside [0, 1].
    """
    parser = argparse.ArgumentParser(
        description="Run the notability feature classification pipeline."
    )
    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the input parquet file with notability_features column",
    )
    parser.add_argument(
        "output_file",
        type=str,
        help="Path where the processed parquet file will be written",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        help="HuggingFace model name for sentence embeddings",
    )
    parser.add_argument(
        "--chunksize",
        type=_positive_int,
        default=1024,
        help="Number of rows per chunk when streaming the parquet file",
    )
    parser.add_argument(
        "--similarity-threshold",
        type=_unit_interval_float,
        default=0.95,
        help="Cosine similarity threshold for clustering (0-1)",
    )
    parser.add_argument(
        "--batch-size",
        type=_positive_int,
        default=128,
        help="Batch size for encoding text",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility",
    )
    parser.add_argument(
        "--row-limit",
        type=int,
        default=None,
        help="Optional maximum number of rows to process",
    )

    # Invalid values are rejected here, before the pipeline is started.
    args = parser.parse_args(argv)

    run_notability_pipeline(
        input_file_path=args.input_file,
        output_file_path=args.output_file,
        model_name=args.model_name,
        chunksize=args.chunksize,
        similarity_threshold=args.similarity_threshold,
        batch_size=args.batch_size,
        seed=args.seed,
        row_limit=args.row_limit,
    )

    print(f"Pipeline completed. Output written to: {args.output_file}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
